{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ab54b2d3-5581-49af-bbad-03f24c5f2e7e",
      "metadata": {
        "id": "ab54b2d3-5581-49af-bbad-03f24c5f2e7e"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import os, sys, csv, re, math\n",
        "from tqdm import tqdm\n",
        "import sec_parser as sp\n",
        "from sec_parser.semantic_elements.table_element.table_parser import TableParser\n",
        "import warnings, time, threading\n",
        "from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, TimeoutError, as_completed\n",
        "from collections import Counter, deque\n",
        "from datetime import datetime\n",
        "import nltk\n",
        "from nltk.corpus import words\n",
        "import numpy as np\n",
        "\n",
        "nltk.download('words')\n",
        "\n",
        "english_words = set(words.words())\n",
        "\n",
        "warnings.filterwarnings('ignore')\n",
        "\n",
        "def log_print(message):\n",
        "    with open(\"logfile.log\", \"a\") as log_file:\n",
        "        print(message, file=log_file)  # writes message to the log file"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2e9b9d5d-342c-40f8-9e8e-888a43ec55c1",
      "metadata": {
        "id": "2e9b9d5d-342c-40f8-9e8e-888a43ec55c1"
      },
      "outputs": [],
      "source": [
        "# Generate A&L word distribution\n",
        "csv_files = [os.path.join(\"output_al_train\", f) for f in os.listdir(\"output_al_train\") if f.endswith(\".csv\")]\n",
        "\n",
        "words_dist = Counter()\n",
        "\n",
        "pbar_lock = threading.Lock()\n",
        "data_lock = threading.Lock()\n",
        "\n",
        "def process_file(file):\n",
        "    global data_lock\n",
        "    global words_dist\n",
        "\n",
        "    with open(file, 'r') as f:\n",
        "        reader = csv.reader(f)\n",
        "        for row in reader:\n",
        "            for cell in row:\n",
        "                cell = cell.strip().lower()\n",
        "                if cell == '':\n",
        "                    continue\n",
        "                if not any(char.isdigit() for char in cell):  # Check if the cell contains any digit\n",
        "                    # Tokenize the cell and check if all tokens are English words\n",
        "                    if any(word in english_words for word in cell.split(\" \")):\n",
        "                        with data_lock:\n",
        "                            words_dist[cell] += 1  # Increment the count for the word\n",
        "\n",
        "\n",
        "with tqdm(total=len(csv_files), file=sys.stdout) as pbar:\n",
        "    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:\n",
        "        futures = {executor.submit(process_file, csv_file): csv_file for csv_file in csv_files}\n",
        "        for future in as_completed(futures):\n",
        "            try:\n",
        "                future.result()\n",
        "            except Exception as e:\n",
        "                print(f\"Error processing file {futures[future]}: {e}\")\n",
        "            finally:\n",
        "                with pbar_lock:\n",
        "                    pbar.update(1)\n",
        "\n",
        "total_count = sum(words_dist.values())\n",
        "words_dist = Counter({key: value / total_count for key, value in words_dist.items()})"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "481ff1b1-0971-4f0d-ab4f-817a2a264667",
      "metadata": {
        "id": "481ff1b1-0971-4f0d-ab4f-817a2a264667"
      },
      "outputs": [],
      "source": [
        "def compute_cosine_similarity(counter1, counter2):\n",
        "    # Calculate the dot product of the two counters\n",
        "    dot_product = sum(counter1[word] * counter2[word] for word in set(counter1) & set(counter2))\n",
        "\n",
        "    # Calculate the magnitudes of the two counters\n",
        "    magnitude1 = math.sqrt(sum(counter1[word] ** 2 for word in counter1))\n",
        "    magnitude2 = math.sqrt(sum(counter2[word] ** 2 for word in counter2))\n",
        "\n",
        "    # Avoid division by zero\n",
        "    if magnitude1 == 0 or magnitude2 == 0:\n",
        "        return 0\n",
        "\n",
        "    # Calculate the cosine similarity\n",
        "    cosine_similarity = dot_product / (magnitude1 * magnitude2)\n",
        "\n",
        "    return cosine_similarity\n",
        "\n",
        "def apply_units(df, unit_multiplier):\n",
        "    # Function to try and convert and scale a value\n",
        "    def try_convert_and_scale(value):\n",
        "        try:\n",
        "            val = float(value)\n",
        "            if val < 100_000:\n",
        "                return val * unit_multiplier\n",
        "            else:\n",
        "                return val\n",
        "        except ValueError:\n",
        "            return value\n",
        "\n",
        "    # Apply the function to each cell in the DataFrame\n",
        "    df = df.applymap(try_convert_and_scale)\n",
        "\n",
        "    return df\n",
        "\n",
        "def find_units(table_node) -> int:\n",
        "    units_pattern = re.compile(r'thousand|million|billion', re.IGNORECASE)\n",
        "    multipliers = {'thousand': 1e3, 'million': 1e6, 'billion': 1e9}\n",
        "\n",
        "    prev_siblings_text = ' '.join(sibling.text for sibling in table_node.parent.children)\n",
        "    units_match = units_pattern.search(prev_siblings_text)\n",
        "\n",
        "    units = multipliers.get(units_match.group(0).lower(), 1) if units_match else 1\n",
        "    return units\n",
        "\n",
        "def clean_df(table_node):\n",
        "    units = find_units(table_node)\n",
        "    table_parser = TableParser(table_node.semantic_element.get_source_code().replace(\"$\", \"\"))\n",
        "    df = table_parser.parse_as_df()\n",
        "    return apply_units(df, units) if units > 1 else df\n",
        "\n",
        "def score_table(table):\n",
        "    global words_dist\n",
        "    table_dist = Counter()\n",
        "\n",
        "    for cell in table.values.flatten():\n",
        "        if isinstance(cell, str) and not any(char.isdigit() for char in cell):\n",
        "            cell = cell.strip().lower()\n",
        "            if cell and any(word in english_words for word in cell.split()):\n",
        "                table_dist[cell] += 1\n",
        "\n",
        "    total_count = sum(words_dist.values())\n",
        "    table_dist = Counter({key: value / total_count for key, value in table_dist.items()})\n",
        "\n",
        "    return compute_cosine_similarity(words_dist, table_dist)\n",
        "\n",
        "def find_table(tree):\n",
        "    best_score = 0\n",
        "    best_table = None\n",
        "    queue = deque([child for child in tree])\n",
        "\n",
        "    while queue:\n",
        "        node = queue.popleft()\n",
        "        if isinstance(node.semantic_element, sp.TableElement):\n",
        "            df = clean_df(node)\n",
        "            score = score_table(df)\n",
        "            if score > best_score:\n",
        "                best_score, best_table = score, df\n",
        "        queue.extend(node.children)\n",
        "    return best_table\n",
        "\n",
        "def process_file(filing, i):\n",
        "    try:\n",
        "        with open(filing, 'r') as f:\n",
        "            html_content = f.read().replace(\",\", \"\")\n",
        "\n",
        "        patterns = {\n",
        "            'company conformed name': r'company conformed name:\\s*(.*?)\\n',\n",
        "            'filed as of date': r'filed as of date:\\s*(.*?)\\n',\n",
        "            'conformed submission type': r'conformed submission type:\\s*(.*?)\\n',\n",
        "        }\n",
        "        results = {field: (re.search(pattern, html_content, re.IGNORECASE).group(1).strip() if re.search(pattern, html_content, re.IGNORECASE) else None) for field, pattern in patterns.items()}\n",
        "\n",
        "        company_name = (results.get('company conformed name') or \"Unknown Company\").replace(\"/\", \"-\").replace(\"\\\\\", \"-\")\n",
        "        filing_date, submission_type, cik = results.get('filed as of date'), results.get('conformed submission type'), os.path.basename(os.path.dirname(filing))\n",
        "\n",
        "        elements = sp.Edgar10QParser().parse(html_content)\n",
        "        tree = sp.TreeBuilder().build(elements)\n",
        "        best_table = find_table(tree)\n",
        "\n",
        "        if best_table is not None:\n",
        "            filename = f\"{submission_type}_{company_name}_{cik}_{filing_date}_AL.csv\"\n",
        "            best_table.to_csv(os.path.join(\"output_al_final\", filename))\n",
        "            log_print(\"Found Statement of Assets and Liabilities.\")\n",
        "            return 0\n",
        "        else:\n",
        "            log_print(f\"[Table Error]: Statement of Assets and Liabilities not found {filing}\")\n",
        "    except Exception as e:\n",
        "        log_print(f\"[Extraction Error]: Error with {filing}: {e}\")\n",
        "\n",
        "    return 1\n",
        "\n",
        "def filter_files(parent_directory):\n",
        "    subdirectories = [os.path.join(parent_directory, d) for d in os.listdir(parent_directory) if os.path.isdir(os.path.join(parent_directory, d))]\n",
        "    total_files = []\n",
        "\n",
        "    for subdir in subdirectories:\n",
        "        files_in_subdir = [\n",
        "            os.path.join(subdir, file)\n",
        "            for file in os.listdir(subdir)\n",
        "            if not (file.startswith(\"8-K\") or file.startswith(\"10-KA\") or file.startswith(\"NT\") or file.startswith(\"10-QA\"))\n",
        "        ]\n",
        "\n",
        "        # Create a dictionary to store the files for each year\n",
        "        files_by_year = {}\n",
        "        for file in files_in_subdir:\n",
        "            # Extract the year from the filename\n",
        "            match = re.search(r'_(\\d{4})-', file)\n",
        "            if match:\n",
        "                year = match.group(1)\n",
        "                if year not in files_by_year:\n",
        "                    files_by_year[year] = []\n",
        "                files_by_year[year].append(file)\n",
        "\n",
        "        for year, files in files_by_year.items():\n",
        "            # Find 10-K files first\n",
        "            ten_k_files = [f for f in files if '10-K_' in f]\n",
        "            if ten_k_files:\n",
        "                total_files.append(ten_k_files[0])\n",
        "            else:\n",
        "                # If no 10-K files, find 10-Q files\n",
        "                ten_q_files = [f for f in files if '10-Q_' in f]\n",
        "                if ten_q_files:\n",
        "                    ten_q_files.sort(key=lambda x: datetime.strptime(re.search(r'_(\\d{4}-\\d{2}-\\d{2})_', x).group(1), '%Y-%m-%d'))\n",
        "                    total_files.append(ten_q_files[-1])\n",
        "\n",
        "    return total_files\n",
        "\n",
        "def main(parent_directory):\n",
        "    total_files = filter_files(parent_directory)\n",
        "\n",
        "    with tqdm(total=len(total_files), file=sys.stdout) as pbar:\n",
        "        with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:\n",
        "            futures = {executor.submit(process_file, file, i): file for i, file in enumerate(total_files)}\n",
        "            for future in as_completed(futures, timeout=15):\n",
        "                try:\n",
        "                    future.result()\n",
        "                except Exception as e:\n",
        "                    print(f\"Error in future {futures[future]}: {e}\")\n",
        "                finally:\n",
        "                    pbar.update(1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4241421f-0e0a-4551-9e1a-a6aef7b7e3e8",
      "metadata": {
        "id": "4241421f-0e0a-4551-9e1a-a6aef7b7e3e8"
      },
      "outputs": [],
      "source": [
        "if __name__ == \"__main__\":\n",
        "    try:\n",
        "        os.remove(\"logfile.log\")\n",
        "    except:\n",
        "        pass\n",
        "\n",
        "    os.makedirs(\"output_al_final\", exist_ok=True)\n",
        "\n",
        "    parent_directory = \"/n/holystore01/LABS/jellias_lab/Lab/summer_2023_sandbox/BDC_Data\"\n",
        "    start_time = time.time()\n",
        "\n",
        "    main(parent_directory)\n",
        "\n",
        "    end_time = time.time()\n",
        "    runtime = end_time - start_time\n",
        "    # log_print(f'The runtime of the program is: {runtime} seconds')\n",
        "    print(f'The runtime of the program is: {runtime} seconds')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "986a1a2a-e1f2-48f2-8ff5-18063265e12c",
      "metadata": {
        "id": "986a1a2a-e1f2-48f2-8ff5-18063265e12c"
      },
      "outputs": [],
      "source": [
        "# def filter_files(parent_directory):\n",
        "#     subdirectories = [os.path.join(parent_directory, d) for d in os.listdir(parent_directory) if os.path.isdir(os.path.join(parent_directory, d))]\n",
        "#     total_files = []\n",
        "\n",
        "#     for subdir in subdirectories:\n",
        "#         files_in_subdir = [\n",
        "#             os.path.join(subdir, file)\n",
        "#             for file in os.listdir(subdir)\n",
        "#             if not (file.startswith(\"8-K\") or file.startswith(\"10-KA\") or file.startswith(\"NT\") or file.startswith(\"10-QA\"))\n",
        "#         ]\n",
        "\n",
        "#         # Create a dictionary to store the files for each year\n",
        "#         files_by_year = {}\n",
        "#         for file in files_in_subdir:\n",
        "#             # Extract the year from the filename\n",
        "#             match = re.search(r'_(\\d{4})-', file)\n",
        "#             if match:\n",
        "#                 year = match.group(1)\n",
        "#                 if year not in files_by_year:\n",
        "#                     files_by_year[year] = []\n",
        "#                 files_by_year[year].append(file)\n",
        "\n",
        "#         for year, files in files_by_year.items():\n",
        "#             # Find 10-K files first\n",
        "#             ten_k_files = [f for f in files if '10-K_' in f]\n",
        "#             if ten_k_files:\n",
        "#                 total_files.append(ten_k_files[0])\n",
        "#             else:\n",
        "#                 # If no 10-K files, find 10-Q files\n",
        "#                 ten_q_files = [f for f in files if '10-Q_' in f]\n",
        "#                 if ten_q_files:\n",
        "#                     ten_q_files.sort(key=lambda x: datetime.strptime(re.search(r'_(\\d{4}-\\d{2}-\\d{2})_', x).group(1), '%Y-%m-%d'))\n",
        "#                     total_files.append(ten_q_files[-1])\n",
        "#     return total_files\n",
        "\n",
        "# files = filter_files( \"/n/holystore01/LABS/jellias_lab/Lab/summer_2023_sandbox/BDC_Data\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "28498d4c-8cc5-4325-bed2-84a6b28116c4",
      "metadata": {
        "id": "28498d4c-8cc5-4325-bed2-84a6b28116c4"
      },
      "outputs": [],
      "source": [
        "# len([f for f in files if \"10-Q\" in f])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "50773b7b-003f-4952-aa1d-489285e68980",
      "metadata": {
        "id": "50773b7b-003f-4952-aa1d-489285e68980"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "env",
      "language": "python",
      "name": "env"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.3"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}